BAP treated hESC (day 8) snRNAseq data analyses (v2)
snRNAseq data analyses
Prerequisites
This experiment has 2 conditions (BAP treated hESC exposed to 20% oxygen and 5% oxygen), with 2 replicates each. The details are provided in the manuscript. The packages that are needed for the analyses were loaded as below. If you need the version information, session information is printed at the bottom of this wiki.
setwd("/work/LAS/geetu-lab/arnstrm/timecourse/5_cr-count.o2stress/version_1")
# load the modules
library(Seurat)
library(knitr)
library(kableExtra)
library(ggplot2)
library(cowplot)
library(patchwork)
library(metap)
library(multtest)
library(gridExtra)
library(dplyr)
library(stringr)
library(TissueEnrich)
library(gprofiler2)
library(tidyverse)
library(enhancedDimPlot)
library(calibrate)
library(ggrepel)
library(dittoSeq)
library(ComplexHeatmap)
library(scales)
library(ggvenn)
library(plotly)
library(DT)
library(ape)
library(enrichR)
library(SeuratWrappers)Importing 10x Datasets
The 10X data was already processed with CellRanger (v.4.0.0) and the counts table was ready for us to import for the data analyses. We used the inbuilt function to import this data and to create a Seurat object as described below.
experiment_name = "BAP"
dataset_loc <- "/work/LAS/geetu-lab/arnstrm/timecourse/1_data/input_v6.1.2/with_introns"
ids <- c("5pcO2_r1", "5pcO2_r2", "20pcO2_r1", "20pcO2_r2")
# function d10x.data
d10x.data <- sapply(ids, function(i){
d10x <- Read10X(file.path(dataset_loc,i,"filtered_feature_bc_matrix"))
colnames(d10x) <- paste(sapply(strsplit(colnames(d10x),split="-"), '[[' , 1L ), i, sep="-")
d10x
})
experiment.data <- do.call("cbind", d10x.data)
bapd8.combined <- CreateSeuratObject(
experiment.data,
project = "BAPd8",
min.cells = 10,
min.genes = 200,
names.field = 2,
names.delim = "\\-")
# backup the object
bapd8.temp <- bapd8.combinedData quality insepction
After the data was imported, we checked the quality of the data. Mitochondrial expression is an important criteria (along with other quantitative features of each nuclei) to decide if the nucleus is good or bad. We tested it as follows
MT ratio in nucleus
bapd8.combined$log10GenesPerUMI <- log10(bapd8.combined$nFeature_RNA) / log10(bapd8.combined$nCount_RNA)
bapd8.combined$mitoRatio <- PercentageFeatureSet(object = bapd8.combined, pattern = "^MT-")
bapd8.combined$mitoRatio <- bapd8.combined@meta.data$mitoRatio / 100
metadata <- bapd8.combined@meta.data
metadata$cells <- rownames(metadata)
metadata <- metadata %>%
dplyr::rename(seq_folder = orig.ident,
nUMI = nCount_RNA,
nGene = nFeature_RNA,
seq_folder = orig.ident)
p <- ggplot(dat = metadata, aes(x=nUMI, y=nGene, color=mitoRatio)) +
geom_point(alpha = 0.5) +
scale_colour_gradient(low = "gray90", high = "black") + labs(colour="MT ratio") +
theme_bw(base_size = 12) +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background = element_blank(),
panel.border = element_rect(colour = "black")) +
xlab("RNA counts") + ylab("Gene counts") +
stat_smooth(method=lm) +
facet_wrap(~seq_folder, labeller = labeller(seq_folder =
c("20pcO2_r1" = "20% Oxygen (rep1)",
"20pcO2_r2" = "20% Oxygen (rep2)",
"5pcO2_r1" = "5% Oxygen (rep1)",
"5pcO2_r2" = "5% Oxygen (rep2)"))) +
scale_y_continuous(label=comma) +
scale_x_continuous(label=comma)
pRelationship between the total molecules detected (transcripts) vs. total genes detected across samples. Each nucleus is represented as a dot, with the color intensity representing the mitochondrial read ratio in that nucleus.
Number of nuclei per sample
ggplot(metadata, aes(x=seq_folder, fill=seq_folder)) +
geom_bar() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold")) +
ggtitle("Number of Nuclei")Number of nuclei found in each sample
Density of nuclei per sample
ggplot(metadata, aes(color=seq_folder, x=nUMI, fill= seq_folder)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
theme_classic() +
ylab("Cell density") +
geom_vline(xintercept = 500)Density of nuclei vs. UMIs
Number of Nuclei vs. genes
ggplot(metadata, aes(x=seq_folder, y=log10(nGene), fill=seq_folder)) +
geom_boxplot() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold")) +
ggtitle("NNuclei vs NGenes")Nubmer of nuclei vs. genes
Number of Nuclei vs. genes
ggplot(metadata, aes(x=log10GenesPerUMI, color = seq_folder, fill=seq_folder)) +
geom_density(alpha = 0.2) +
theme_classic() +
geom_vline(xintercept = 0.8)Nubmer of Nuclei vs. genes
Mitochondrial density across samples
ggplot(metadata, aes(color=seq_folder, x=mitoRatio, fill=seq_folder)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
theme_classic() +
geom_vline(xintercept = 0.2)
#> Warning: Transformation introduced infinite values in continuous x-axis
#> Warning: Removed 17 rows containing non-finite values (stat_density).Mitochondrial density across samples
Data filtering
After inspection, we decided to remove all mitochondiral genes as well as ribosomal genes from our analyses.
set up the metadata file and organize
bapd8.combined <- bapd8.temp
df <- bapd8.combined@meta.data
df$replicate <- NA
df$replicate[which(str_detect(df$orig.ident, "5pcO2"))] <- "5pcO2"
df$replicate[which(str_detect(df$orig.ident, "20pcO2"))] <- "20pcO2"
bapd8.combined@meta.data <- df
bapd8.combined[["percent.mt"]] <- PercentageFeatureSet(bapd8.combined, pattern = "^MT-")
datatable(bapd8.combined@meta.data, rownames = TRUE, filter="top", options = list(pageLength = 15, scrollX=T) )set up the metadata file and organize
v1 <- VlnPlot(bapd8.combined, features = "nFeature_RNA", pt.size = 1) +
geom_hline(yintercept=200, color = "red", size=1) +
geom_hline(yintercept=7500, color = "red", size=1) +
theme(legend.position = "none")
v2 <- VlnPlot(bapd8.combined, features = "nCount_RNA", pt.size = 1) +
theme(legend.position = "none")
v3 <- VlnPlot(bapd8.combined, features = "percent.mt", pt.size = 1) +
geom_hline(yintercept=15, color = "red", size=1) +
theme(legend.position = "none")
v1 | v2 | v3Before filtering
B1 <- FeatureScatter(bapd8.combined, feature1 = "nCount_RNA", feature2 = "percent.mt")
B2 <- FeatureScatter(bapd8.combined, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
B1 | B2Preliminary filtering
bapd8.combined <- subset(bapd8.combined, subset = nFeature_RNA > 200 & nFeature_RNA < 7500 & percent.mt < 25)
I1 <- FeatureScatter(bapd8.combined, feature1 = "nCount_RNA", feature2 = "percent.mt")
I2 <- FeatureScatter(bapd8.combined, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
I1 | I2Final filtering
bapd8.combined <- subset(bapd8.combined, subset = nFeature_RNA > 200 & nFeature_RNA < 7500 & percent.mt < 15)
A1 <- FeatureScatter(bapd8.combined, feature1 = "nCount_RNA", feature2 = "percent.mt")
A2 <- FeatureScatter(bapd8.combined, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
A1 | A2Removing ribosomal and MT proteins
counts <- GetAssayData(object = bapd8.combined, slot = "counts")
counts <- counts[grep(pattern = "^MT", x = rownames(counts), invert = TRUE),]
counts <- counts[grep(pattern = "^MT", x = rownames(counts), invert = TRUE),]
counts <- counts[grep(pattern = "^RPL", x = rownames(counts), invert = TRUE),]
counts <- counts[grep(pattern = "^RPS", x = rownames(counts), invert = TRUE),]
counts <- counts[grep(pattern = "^MRPS", x = rownames(counts), invert = TRUE),]
counts <- counts[grep(pattern = "^MRPL", x = rownames(counts), invert = TRUE),]
keep_genes <- Matrix::rowSums(counts) >= 10
filtered_counts <- counts[keep_genes, ]
bapd8.fcombined <- CreateSeuratObject(filtered_counts, meta.data = bapd8.combined@meta.data)
bapd8.fcombined@meta.data <- bapd8.fcombined@meta.data[1:4]
bapd8.combined <- bapd8.fcombinedData integration and Clustering
Seurat package was used for integrating samples and running the snRNAseq analyses.
data integration
set.seed(1111)
bapd8.list <- SplitObject(bapd8.combined, split.by = "orig.ident")
bapd8.list <- lapply(X = bapd8.list, FUN = function(x) {
x <- NormalizeData(x)
x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})Seurat
bapd8.anchors <- FindIntegrationAnchors(object.list = bapd8.list, dims = 1:20)
bapd8.integrated <- IntegrateData(anchorset = bapd8.anchors, dims = 1:20)
DefaultAssay(bapd8.integrated) <- "integrated"
bapd8.integrated <- ScaleData(bapd8.integrated, verbose = FALSE)
bapd8.integrated <- RunPCA(bapd8.integrated, npcs = 30, verbose = FALSE)
bapd8.integrated <- RunUMAP(bapd8.integrated, reduction = "pca", dims = 1:20)
#> Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
#> To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
#> This message will be shown once per session
bapd8.integrated <- FindNeighbors(bapd8.integrated, reduction = "pca", dims = 1:20)
bapd8.integrated <- FindClusters(bapd8.integrated, resolution = 0.5)
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 6428
#> Number of edges: 254094
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8858
#> Number of communities: 11
#> Elapsed time: 0 secondsRenumber the clusters
By default Seurat assigns the cluster identity starting from zero. Since we prefer identity starting from one, we renumbered cluster 0-10 to 1-11.
num.clusters <- nlevels(bapd8.integrated$seurat_clusters)
df <- bapd8.integrated@meta.data
df$new_clusters <- as.factor(as.numeric(df$seurat_clusters))
bapd8.integrated@meta.data <- df
Idents(bapd8.integrated) <- "new_clusters"dimplots (colored based on clusters)
The Dimensional reduction plot was plotted using the Seurat DipPlot function, with colors representing different groups/clusters.
d1 <- enhancedDimPlot(object = bapd8.integrated, grouping_var = 'ident', reduction = "umap", label = TRUE, pt.size = 1, alpha = 0.5) +
ggtitle("A") + xlab("UMAP_1") + ylab("UMAP_2") +
theme_classic() +
theme(legend.position = "none", plot.title = element_text(face = "bold"))
ggplotly(d1)
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issues
#> Warning in geom2trace.default(dots[[1L]][[11L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
#> If you'd like to see this geom implemented,
#> Please open an issue with your example code at
#> https://github.com/ropensci/plotly/issuesFig2A: Dimensional reduction plot showing 5,355 nuclei plotted in two dimensions. The colored dots represent individual nuclei and are assigned based on cluster identity
dimplots (colored based on conditions)
d2 <- enhancedDimPlot(object = bapd8.integrated, grouping_var = 'replicate', reduction = "umap", label = FALSE, pt.size = 1, alpha = 0.4) +
ggtitle("B") +
xlab("UMAP_1") +
ylab("UMAP_2") +
theme_classic() +
theme(legend.justification = c(1, 1), legend.position = c(1, 1), plot.title = element_text(face = "bold")) +
scale_colour_manual(name = "Conditions",
labels = c(expression(paste('20% ', 'O'[2])), expression(paste('5% ', 'O'[2]))),
values = c("20pcO2" = "#0571b0", "5pcO2" = "#ca0020")) +
scale_fill_manual(name = "Conditions",
labels = c(expression(paste('20% ', 'O'[2])), expression(paste('5% ', 'O'[2]))),
values = c("20pcO2" = "#0571b0", "5pcO2" = "#ca0020")) +
scale_linetype_manual(values = "blank")
ggplotly(d2)Fig2B: Dimensional reduction plot showing 5,355 nuclei plotted in two dimensions. The colored dots represent individual nuclei and are assigned based on treatment.
dimplots (colored based on samples)
d3 <- enhancedDimPlot(object = bapd8.integrated, grouping_var = 'orig.ident', reduction = "umap", label = FALSE, pt.size = 1, alpha = 0.4) +
ggtitle("C") +
xlab("UMAP_1") +
ylab("UMAP_2") +
theme_classic() +
theme(legend.justification = c(1, 1), legend.position = c(1, 1), plot.title = element_text(face = "bold")) +
scale_colour_manual(name = "Replicates",
labels = c(expression(paste('20% ', 'O'[2], ' rep1')), expression(paste('20% ', 'O'[2], ' rep2')), expression(paste('5% ', 'O'[2], ' rep1')), expression(paste('5% ', 'O'[2], ' rep1'))),
values = c("20pcO2_r1" = "#0571b0", "20pcO2_r2" = "#92c5de", "5pcO2_r1" = "#ca0020", "5pcO2_r2" = "#f4a582")) +
scale_fill_manual(name = "Replicates",
labels = c(expression(paste('20% ', 'O'[2], ' rep1')), expression(paste('20% ', 'O'[2], ' rep2')), expression(paste('5% ', 'O'[2], ' rep1')), expression(paste('5% ', 'O'[2], ' rep1'))),
values = c("20pcO2_r1" = "#0571b0", "20pcO2_r2" = "#92c5de", "5pcO2_r1" = "#ca0020", "5pcO2_r2" = "#f4a582")) +
scale_linetype_manual(values = "blank")
ggplotly(d3)Fig2C: Dimensional reduction plot showing 5,355 nuclei plotted in two dimensions. The colored dots represent individual nuclei and are assigned based on replicate
Find markers
Find markers for each cluster. The Seurat command FindMarkers was run using the cluster identity assigned in the previous step. Additional column with the Fold (converted from natural log fold change of seurat output) was added to the table. The filtering was done for genes with avg_FC >= 1.5 and p_val_adj <= 0.05.
DefaultAssay(bapd8.integrated) <- "RNA"
lhs.a <- paste("markers.all.", 1:num.clusters, sep="")
rhs.a <- paste("FindMarkers(bapd8.integrated, ident.1 = ",1:num.clusters," )", sep="")
commands.a <- paste(paste(lhs.a, rhs.a, sep="<-"), collapse=";")
eval(parse(text=commands.a))
#markers.all.1$avg_FC <- 2^markers.all.1$avg_log2FC
lhs.b <- paste("markers.all.", 1:num.clusters, "$avg_FC", sep="")
rhs.b <- paste("2^markers.all.",1:num.clusters,"$avg_log2FC", sep="")
commands.b <- paste(paste(lhs.b, rhs.b, sep="<-"), collapse=";")
eval(parse(text=commands.b))
lhs.c <- paste("markers.filtered.", 1:num.clusters, sep="")
rhs.c <- paste("markers.all.",1:num.clusters," %>% filter(avg_FC >= 1.5) %>% filter(p_val_adj <= 0.05) %>% arrange(desc(avg_FC))", sep="")
commands.c <- paste(paste(lhs.c, rhs.c, sep="<-"), collapse=";")
eval(parse(text=commands.c))
lhs.d <- paste("markers.filtered.names.", 1:num.clusters, sep="")
rhs.d <- paste("rownames(markers.filtered.",1:num.clusters,")", sep="")
commands.d <- paste(paste(lhs.d, rhs.d, sep="<-"), collapse=";")
eval(parse(text=commands.d))Check the number of markers
message (paste("Cluster 1 as", length(markers.filtered.names.1), "markers", sep = " "))
message (paste("Cluster 2 as", length(markers.filtered.names.2), "markers", sep = " "))
message (paste("Cluster 3 as", length(markers.filtered.names.3), "markers", sep = " "))
message (paste("Cluster 4 as", length(markers.filtered.names.4), "markers", sep = " "))
message (paste("Cluster 5 as", length(markers.filtered.names.5), "markers", sep = " "))
message (paste("Cluster 6 as", length(markers.filtered.names.6), "markers", sep = " "))
message (paste("Cluster 7 as", length(markers.filtered.names.7), "markers", sep = " "))
message (paste("Cluster 8 as", length(markers.filtered.names.8), "markers", sep = " "))
message (paste("Cluster 9 as", length(markers.filtered.names.9), "markers", sep = " "))
message (paste("Cluster 10 as", length(markers.filtered.names.10), "markers", sep = " "))
message (paste("Cluster 11 as", length(markers.filtered.names.11), "markers", sep = " "))Marker plots
Combined expression of all the markers genes in various clusters. The markers have higher expression in their respecitve cluster than compared to the rest of the clusters.
ggplotColours <- function(n = 6, h = c(0, 360) + 15){
if ((diff(h) %% 360) < 1) h[2] <- h[2] - 360/n
hcl(h = (seq(h[1], h[2], length = n)), c = 100, l = 65)
}
color_list <- ggplotColours(n=11)
grouped_violinPlots <- function(markersfile, clusternumber, seuratobject = bapd8.integrated) {
dittoPlotVarsAcrossGroups(seuratobject, markersfile,
group.by = "new_clusters", main = paste("Cluster ", clusternumber, " markers"),
xlab = "Clusters",
ylab = "Mean z-score expression",
x.labels = c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4",
"Cluster 5", "Cluster 6", "Cluster 7", "Cluster 8",
"Cluster 9", "Cluster 10", "Cluster 11"),
vlnplot.lineweight = 0.5,
legend.show = FALSE,
jitter.size = 0.5,
color.panel = color_list)
}Markers of cluster 1
grouped_violinPlots(markers.filtered.names.1, 1)Markers of cluster 2
grouped_violinPlots(markers.filtered.names.2, 2)Markers of cluster 3
grouped_violinPlots(markers.filtered.names.3, 3)Markers of cluster 4
grouped_violinPlots(markers.filtered.names.4, 4)Markers of cluster 5
grouped_violinPlots(markers.filtered.names.5, 5)Markers of cluster 6
grouped_violinPlots(markers.filtered.names.6, 6)Markers of cluster 7
grouped_violinPlots(markers.filtered.names.7, 7)Markers of cluster 8
grouped_violinPlots(markers.filtered.names.8, 8)Markers of cluster 9
grouped_violinPlots(markers.filtered.names.9, 9)Markers of cluster 10
grouped_violinPlots(markers.filtered.names.10, 10)Markers of cluster 11
grouped_violinPlots(markers.filtered.names.11, 11)Run PlacentaCellEnrich on markers
PlacentaCellEnrich was run command-line using the TissueEnrich R package. We used this to assign cell identity to cluster 2, 3, 5 and 6. The function used for running PCE is as follows:
# Vento-Tormo et al., dataset
input="/work/LAS/geetu-lab/arnstrm/timecourse/1_data/input_v6.1.2/other.info/"
l <-
load(file = paste0(input, "combine-test-expression1.Rdata"))
humanGeneMapping <- dataset$GRCH38$humanGeneMapping
d <- dataset$PlacentaDeciduaBloodData
data <- d$expressionData
cellDetails <- d$cellDetails
# Xiang et al., dataset
te.dataset.xiang <- readRDS(paste0(input, "te.dataset.xiang.rds"))
# Castel et al., dataset
te.dataset.castel <- readRDS(paste0(input, "te.dataset.castel.rds"))
# full names for cell types
xi.md <-
read.csv(
paste0(input, "/md-xi.tsv"),
sep = "\t",
header = TRUE,
row.names = 1
)
vt.md <-
read.csv(
paste0(input, "md-vt.tsv"),
sep = "\t",
header = TRUE,
row.names = 1
)
zp.md <-
read.csv(
paste0(input, "md-zp.tsv"),
sep = "\t",
header = TRUE,
row.names = 1
)
runpce <- function(inputgenelist, barcolor) {
inputGenes <- toupper(inputgenelist)
gs1 <- GeneSet(geneIds = toupper(inputgenelist))
humanGene <-
humanGeneMapping[humanGeneMapping$Gene.name %in% inputGenes, ]
inputGenes <- humanGene$Gene
expressionData <-
data[intersect(row.names(data), humanGeneMapping$Gene), ]
se <-
SummarizedExperiment(
assays = SimpleList(as.matrix(expressionData)),
rowData = row.names(expressionData),
colData = colnames(expressionData)
)
cellSpecificGenesExp <-
teGeneRetrieval(se, expressedGeneThreshold = 1)
gs <- GeneSet(geneIds = toupper(inputGenes))
output.vt <- teEnrichmentCustom(gs, cellSpecificGenesExp)
en.output.vt <-
setNames(data.frame(assay(output.vt[[1]]), row.names = rowData(output.vt[[1]])[, 1]),
colData(output.vt[[1]])[, 1])
row.names(cellDetails) <- cellDetails$RName
en.output.vt$Tissue <-
cellDetails[row.names(en.output.vt), "CellName"]
en.output.vt$source <- "VT"
en.output.vt <- en.output.vt[order(-en.output.vt$Log10PValue),]
en.output.vt <-
merge(en.output.vt, vt.md, by = "row.names", all.x = TRUE)
en.output.vt <- rownames_to_column(en.output.vt, var = "Name")
output.xi <- teEnrichmentCustom(gs1, te.dataset.xiang)
en.output.xi <-
setNames(data.frame(assay(output.xi[[1]]), row.names = rowData(output.xi[[1]])[, 1]),
colData(output.xi[[1]])[, 1])
en.output.xi$Tissue <- rownames(en.output.xi)
en.output.xi$source <- "Xi"
en.output.xi <- en.output.xi[order(-en.output.xi$Log10PValue),]
en.output.xi <-
merge(en.output.xi, xi.md, by = "row.names", all.x = TRUE)
en.output.xi <- rownames_to_column(en.output.xi, var = "Name")
output.zp <- teEnrichmentCustom(gs1, te.dataset.castel)
en.output.zp <-
setNames(data.frame(assay(output.zp[[1]]), row.names = rowData(output.zp[[1]])[, 1]),
colData(output.zp[[1]])[, 1])
en.output.zp$Tissue <- rownames(en.output.zp)
en.output.zp$source <- "ZP"
en.output.zp <- en.output.zp[order(-en.output.zp$Log10PValue),]
en.output.zp <-
merge(en.output.zp, zp.md, by = "row.names", all.x = TRUE)
en.output.zp <- rownames_to_column(en.output.zp, var = "Name")
en.conbined <- rbind(en.output.vt, en.output.xi, en.output.zp)
# p <- 0.05
# logp <- -log10(p)
en.conbined <- en.conbined %>%
# mutate(Log10PValue = replace(Log10PValue, Log10PValue < logp, 0))
# en.conbined %>%
group_by(source) %>%
arrange(source, desc(Log10PValue)) %>% dplyr::slice(1:7) %>%
ungroup %>%
mutate(
source = as.factor(source),
CellNames = tidytext::reorder_within(CellNames, Log10PValue, source, sep = ":")
)
ggplot(en.conbined, aes(CellNames, Log10PValue)) + geom_bar(stat = 'identity', fill = barcolor) + theme_minimal() +
theme(
axis.text.x = element_text(
vjust = 1,
hjust = 1,
size = 10
),
axis.text.y = element_text(size = 8),
plot.margin = margin(10, 10, 10, 100),
legend.position = "none",
plot.title = element_text(
color = "black",
size = 10,
face = "bold.italic"
),
axis.title.y = element_blank(),
axis.line.x = element_line(
colour = 'black',
size = 0.5,
linetype = 'solid'
),
axis.ticks.x = element_line(
colour = 'black',
size = 1,
linetype = 'solid'
),
axis.title.x = element_text(
color = "black",
size = 10,
face = "bold"
)
) +
scale_y_continuous(expand = expansion(mult = c(0, .1)), breaks = pretty_breaks()) + ylab("-log10 p-value") +
facet_wrap( ~ source, scales = "free", ncol = 3) +
coord_flip()
}PCE for cluster 1 markers
runpce(markers.filtered.names.1, color_list[1])PCE for cluster 2 markers
runpce(markers.filtered.names.2, color_list[2])PCE for cluster 3 markers
runpce(markers.filtered.names.3, color_list[3])PCE for cluster 4 markers
runpce(markers.filtered.names.4, color_list[4])PCE for cluster 5 markers
runpce(markers.filtered.names.5, color_list[5])PCE for cluster 6 markers
runpce(markers.filtered.names.6, color_list[6])PCE for cluster 7 markers
runpce(markers.filtered.names.7, color_list[7])PCE for cluster 8 markers
runpce(markers.filtered.names.8, color_list[8])PCE for cluster 9 markers
runpce(markers.filtered.names.9, color_list[9])runpce(markers.filtered.names.10, color_list[10])runpce(markers.filtered.names.11, color_list[11])Inspecting STB clusters (cluster 4 and 8)
# findmarkers
markers.all.4vs8 <- FindMarkers(bapd8.integrated, ident.1 = 4, ident.2 = 8)
markers.all.8vs4 <- FindMarkers(bapd8.integrated, ident.1 = 8, ident.2 = 4)
#convert FC
markers.all.4vs8$avg_FC <- 2^markers.all.4vs8$avg_log2FC
markers.all.8vs4$avg_FC <- 2^markers.all.8vs4$avg_log2FC
#Filter
markers.filtered.4vs8 <- markers.all.4vs8 %>% filter(avg_FC >= 1.5) %>% filter(p_val_adj <= 0.05)
markers.filtered.8vs4 <- markers.all.8vs4 %>% filter(avg_FC >= 1.5) %>% filter(p_val_adj <= 0.05)
#list
markers.filtered.names.4vs8 <- rownames(markers.filtered.4vs8)
markers.filtered.names.8vs4 <- rownames(markers.filtered.8vs4)PCE for cluster 4 markers (vs. 8)
runpce(markers.filtered.names.4vs8, color_list[4])PCE for cluster 8 markers (vs. 4)
runpce(markers.filtered.names.8vs4, color_list[5])Save expression values
write.table(markers.filtered.1, file = "Cluster1_markers.expression.filtered.tsv", sep = "\t")
write.table(markers.filtered.2, file = "Cluster2_markers.expression.filtered.tsv", sep = "\t")
write.table(markers.filtered.3, file = "Cluster3_markers.expression.filtered.tsv", sep = "\t")
write.table(markers.filtered.4, file = "Cluster4_markers.expression.filtered.tsv", sep = "\t")
write.table(markers.filtered.5, file = "Cluster5_markers.expression.filtered.tsv", sep = "\t")
write.table(markers.filtered.6, file = "Cluster6_markers.expression.filtered.tsv", sep = "\t")
write.table(markers.filtered.7, file = "Cluster7_markers.expression.filtered.tsv", sep = "\t")
write.table(markers.filtered.8, file = "Cluster8_markers.expression.filtered.tsv", sep = "\t")
write.table(markers.filtered.9, file = "Cluster9_markers.expression.filtered.tsv", sep = "\t")
write.table(markers.filtered.10, file = "Cluster10_markers.expression.filtered.tsv", sep = "\t")
write.table(markers.filtered.11, file = "Cluster11_markers.expression.filtered.tsv", sep = "\t")
write.table(markers.filtered.4vs8, file = "Cluster4vs8_markers.expression.filtered.tsv", sep = "\t")
write.table(markers.filtered.8vs4, file = "Cluster8vs4_markers.expression.filtered.tsv", sep = "\t")Plotting functions
Some plotting functions that we used for finalizing violin plots.
# https://divingintogeneticsandgenomics.rbind.io/post/stacked-violin-plot-for-visualizing-single-cell-data-in-seurat/
modify_vlnplot<- function(obj,
feature,
pt.size = 0,
plot.margin = unit(c(-0.75, 0, -0.75, 0), "cm"),
...) {
p<- VlnPlot(obj, features = feature, pt.size = pt.size, ... ) +
xlab("") + ylab(feature) + ggtitle("") +
theme(legend.position = "none",
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_text(size = rel(1), angle = 0),
axis.text.y = element_text(size = rel(1)),
plot.margin = plot.margin )
return(p)
}
extract_max<- function(p){
ymax<- max(ggplot_build(p)$layout$panel_scales_y[[1]]$range$range)
return(ceiling(ymax))
}
StackedVlnPlot<- function(obj, features,
pt.size = 0,
plot.margin = unit(c(-0.75, 0, -0.75, 0), "cm"),
...) {
plot_list<- purrr::map(features, function(x) modify_vlnplot(obj = obj,feature = x, ...))
# Add back x-axis title to bottom plot. patchwork is going to support this?
plot_list[[length(plot_list)]]<- plot_list[[length(plot_list)]] +
theme(axis.text.x=element_text(), axis.ticks.x = element_line())
# change the y-axis tick to only max value
ymaxs<- purrr::map_dbl(plot_list, extract_max)
plot_list<- purrr::map2(plot_list, ymaxs, function(x,y) x +
scale_y_continuous(breaks = c(y)) +
expand_limits(y = y))
p<- patchwork::wrap_plots(plotlist = plot_list, ncol = 1)
return(p)
}Find all Markers
Find all markers using the in build function of Seurat
bapd8.markers <- FindAllMarkers(bapd8.integrated, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
bapd8.markers.ranked.10.percluster <- bapd8.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
datatable(bapd8.markers.ranked.10.percluster, rownames = FALSE, filter="top", options = list(pageLength = 10, scrollX=T))Find Conserved markers
This is optional. We did not use the marker genes specific for clusters of each condition.
lhs.f <- paste("markers.conserved.", 1:num.clusters, sep="")
rhs.f <- paste("FindConservedMarkers(bapd8.integrated, ident.1 = ",1:num.clusters,', grouping.var = "replicate", verbose = FALSE)', sep="")
commands.f <- paste(paste(lhs.f, rhs.f, sep="<-"), collapse=";")
eval(parse(text=commands.f))Expression tables
To export average expression levels for all genes, summarized based on all cells, each condition and each replicate, we used AverageExpression command for Seurat.
cluster.averages.data <- AverageExpression(bapd8.integrated, slot = "data", assays = "RNA")
condition.averages.data <- AverageExpression(bapd8.integrated, slot = "data", group.by = "orig.ident", assays = "RNA")
replicate.averages.data <- AverageExpression(bapd8.integrated, slot = "data", group.by = "replicate", assays = "RNA")
avg.data <- cluster.averages.data[["RNA"]]
avg.condition <- condition.averages.data[["RNA"]]
avg.replicate <- replicate.averages.data[["RNA"]]
write.table(avg.data, file="snn-average-data.tsv", sep= "\t")
write.table(avg.condition, file="snn-average-condition.tsv", sep= "\t")
write.table(avg.replicate, file="snn-average-replicate.tsv", sep= "\t")
#avg.data$gene <- row.names(avg.data)
#avg.data <- as_tibble(avg.data)
#head(avg.data)
#avg.data <- avg.data[, c(12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)]
#colnames(avg.data) <- c("Gene", paste("Cluster", colnames(avg.data[2:12]), sep = "_"))
#datatable(avg.data, rownames = FALSE, filter="top", options = list(pageLength = 10, scrollX=T))Cell numbers per clusters
Generate a summary table showing the number of cells in each cluster, broken down by replicates and treatment.
cells <- bapd8.integrated@meta.data %>%
dplyr::group_by(orig.ident, new_clusters, replicate) %>%
dplyr::summarise(length(new_clusters)) %>%
dplyr::rename(sample = orig.ident,
cluster = new_clusters,
condition = replicate,
number.of.cells = `length(new_clusters)`)
#head(cells)
#datatable(cells, rownames = TRUE, filter="top", options = list(pageLength = 10, scrollX=T))
#cells %>%
# group_by(orig.ident, new_clusters ) %>%
# summarize(`length(new_clusters)`)
ggplot(cells, aes(x = cluster, y = number.of.cells, fill = cluster )) +
geom_col() +
facet_wrap(~condition) +
theme_classic() +
theme(legend.position = "none")DE between conditions and Volcano plots
The DE was carried out between the conditions for each cluster. First we modified the metadata to create a separate column that has both the condition as well as the cluster number and then we used Seurat’s FindMarkers to find the genes that are differentially expressed. The genes that have log2FC > 1 or <-1 are shown in color if they also have p-val <0.05.
head(bapd8.integrated@meta.data)
#> orig.ident nCount_RNA nFeature_RNA replicate
#> AAACGAAGTGAGACGT-5pcO2_r1 5pcO2_r1 921 687 5pcO2
#> AAACGCTAGCCGATAG-5pcO2_r1 5pcO2_r1 1786 1053 5pcO2
#> AAAGAACAGGCACTAG-5pcO2_r1 5pcO2_r1 1091 573 5pcO2
#> AAAGAACTCATTTCCA-5pcO2_r1 5pcO2_r1 21054 4986 5pcO2
#> AAAGGATTCCGTAGTA-5pcO2_r1 5pcO2_r1 9718 3540 5pcO2
#> AAAGGTAGTAGGGAGG-5pcO2_r1 5pcO2_r1 18482 5317 5pcO2
#> integrated_snn_res.0.5 seurat_clusters new_clusters
#> AAACGAAGTGAGACGT-5pcO2_r1 2 2 3
#> AAACGCTAGCCGATAG-5pcO2_r1 0 0 1
#> AAAGAACAGGCACTAG-5pcO2_r1 2 2 3
#> AAAGAACTCATTTCCA-5pcO2_r1 4 4 5
#> AAAGGATTCCGTAGTA-5pcO2_r1 8 8 9
#> AAAGGTAGTAGGGAGG-5pcO2_r1 2 2 3
df <- bapd8.integrated@meta.data
df$stim <- (paste(df$replicate,df$new_clusters, sep = "."))
df$stim <- gsub('5pcO2', 'FIVE', df$stim)
df$stim <- gsub('20pcO2', 'TWENTY', df$stim)
bapd8.integrated@meta.data <- df
Idents(bapd8.integrated) <- "stim"
lhs.g <- paste("clus", 1:num.clusters, ".five.twenty", sep="")
rhs.g <- paste('FindMarkers(bapd8.integrated, ident.1 = "FIVE.', 1:num.clusters, '", ident.2 = "TWENTY.', 1:num.clusters, '", verbose = FALSE, logfc.threshold = 0)', sep="")
commands.g <- paste(paste(lhs.g, rhs.g, sep="<-"), collapse=";")
eval(parse(text=commands.g))
# lhs.h <- paste("clus", 1:num.clusters, ".five.twenty$log2fc", sep="")
# rhs.h <- paste('log2(exp(clus', 1:num.clusters, '.five.twenty$avg_logFC))', sep="")
# commands.h <- paste(paste(lhs.h, rhs.h, sep="<-"), collapse=";")
# eval(parse(text=commands.h))
lhs.j <- paste("clus", 1:num.clusters, ".five.twenty$Gene", sep="")
rhs.j <- paste('row.names(clus', 1:num.clusters, '.five.twenty)', sep="")
commands.j <- paste(paste(lhs.j, rhs.j, sep="<-"), collapse=";")
eval(parse(text=commands.j))
lhs.k <- paste("clus", 1:num.clusters, '.five.twenty$diffexpressed <- "other genes"', sep="")
commands.k <- paste(lhs.k , sep=";")
eval(parse(text=commands.k))
lhs.l <- paste('clus', 1:num.clusters,'.five.twenty$diffexpressed[clus',1:num.clusters,'.five.twenty$avg_log2FC >= 0.584962501 & clus', 1:num.clusters,'.five.twenty$p_val_adj <= 0.05] <- "up in 5% O2"', sep="")
commands.l <- paste(lhs.l , sep=";")
eval(parse(text=commands.l))
lhs.m <- paste('clus', 1:num.clusters,'.five.twenty$diffexpressed[clus',1:num.clusters,'.five.twenty$avg_log2FC <= -0.584962501 & clus', 1:num.clusters,'.five.twenty$p_val_adj <= 0.05] <- "up in 20% O2"', sep="")
commands.m <- paste(lhs.m , sep=";")
eval(parse(text=commands.m))
lhs.n <- paste('clus', 1:num.clusters,'.five.twenty$delabel <- ""', sep="")
commands.n <- paste(lhs.n , sep=";")
eval(parse(text=commands.n))
lhs.o <- paste('clus', 1:num.clusters,'.five.twenty$delabel[clus', 1:num.clusters,'.five.twenty$avg_log2FC >= 0.584962501 & clus', 1:num.clusters,'.five.twenty$p_val_adj <= 0.05]', sep="")
rhs.o <- paste('clus', 1:num.clusters,'.five.twenty$Gene[clus', 1:num.clusters,'.five.twenty$avg_log2FC >= 0.584962501 & clus', 1:num.clusters,'.five.twenty$p_val_adj <= 0.05]', sep="")
commands.o <- paste(paste(lhs.o, rhs.o, sep="<-"), collapse=";")
eval(parse(text=commands.o))
lhs.p <- paste('clus', 1:num.clusters,'.five.twenty$delabel[clus', 1:num.clusters,'.five.twenty$avg_log2FC <= -0.584962501 & clus', 1:num.clusters,'.five.twenty$p_val_adj <= 0.05]', sep="")
rhs.p <- paste('clus', 1:num.clusters,'.five.twenty$Gene[clus', 1:num.clusters,'.five.twenty$avg_log2FC <= -0.584962501 & clus', 1:num.clusters,'.five.twenty$p_val_adj <= 0.05]', sep="")
commands.p <- paste(paste(lhs.p, rhs.p, sep="<-"), collapse=";")
eval(parse(text=commands.p))Volcano Plot function: This plots allows us to visualize the genes that are overexpressed in each condition along with its p-value. First, we will setup a function to make a volcano plot and then we call them for each cluster and depict them as an interactive plot.
myVolcanoPlot <- function(mydf, clus.number) {
de <- ggplot(data=mydf, aes(x=avg_log2FC, y=-log10(p_val), col=diffexpressed, label=delabel)) +
geom_point(alpha = 0.5) +
theme_classic() +
scale_color_manual(name = "Expression", values=c("#4d4d4d", "#ca0020", "#0571b0")) +
ggtitle(paste("Cluster ", clus.number, ": 20% O2 vs. 5% O2")) +
xlab("Log2 fold change") +
ylab("-log10 pvalue (adjusted)") +
theme(legend.text.align = 0)
return(de)
}Volcano plot for cluster 1
ggplotly(myVolcanoPlot(clus1.five.twenty, 1))Fig.6-1: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 1
Volcano plot for cluster 2
ggplotly(myVolcanoPlot(clus2.five.twenty, 2))Fig.6-2: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 2
ggsave("Figure_6_A.svg", dpi=900, width = 8, height = 6)Volcano plot for cluster 3
ggplotly(myVolcanoPlot(clus3.five.twenty, 3))Fig.6-3: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 3
Volcano plot for cluster 4
ggplotly(myVolcanoPlot(clus4.five.twenty, 4))Fig.6-4: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 4
Volcano plot for cluster 5
ggplotly(myVolcanoPlot(clus5.five.twenty, 5))Fig.6-5: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 5
Volcano plot for cluster 6
ggplotly(myVolcanoPlot(clus6.five.twenty, 6))Fig.6-6: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 6
Volcano plot for cluster 7
ggplotly(myVolcanoPlot(clus7.five.twenty, 7))Fig.6-7: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 7
Volcano plot for cluster 8
ggplotly(myVolcanoPlot(clus8.five.twenty, 8))Fig.6-8: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 8
Volcano plot for cluster 9
ggplotly(myVolcanoPlot(clus9.five.twenty, 9))Fig.6-9: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 9
Volcano plot for cluster 10
ggplotly(myVolcanoPlot(clus10.five.twenty, 10))Fig.6-10: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 10
Volcano plot for cluster 11
ggplotly(myVolcanoPlot(clus11.five.twenty, 11))Fig.6-11: Interactive Volcano plot showing genes upregulated in 5% and 20% oxygen conditions for cluster 11
Figures from the publication
Prepare dataset for individual plots
DefaultAssay(bapd8.integrated) <- "RNA"
Idents(bapd8.integrated) <- "new_clusters"
cluster4n8 <- subset(bapd8.integrated, idents = c("4", "8"))
#cluster2356 <- subset(bapd8.integrated, idents = c("1", "3", "4", "8"))
#cluster2n3 <- subset(bapd8.integrated, idents = c("2", "3"))
Idents(cluster4n8) <- "new_clusters"
#Idents(cluster2356) <- "new_clusters"
#Idents(cluster2n3) <- "new_clusters"
#DefaultAssay(cluster2356) <- "RNA"
DefaultAssay(cluster4n8) <- "RNA"
#DefaultAssay(cluster2n3) <- "RNA"Figure 3
Transcription factors
fig3a <- c("GATA3", "TFAP2A")
multi_dittoPlot(bapd8.integrated, vars = fig3a, group.by = "new_clusters",
vlnplot.lineweight = 0.2, jitter.size = 0.3, ncol = 2, color.panel = color_list)Fig3A: Violin plots showing expression (average log fold change) for genes encoding transcription factors
Structural proteins
fig3b <- c("KRT7", "KRT23")
multi_dittoPlot(bapd8.integrated, vars = fig3b, group.by = "new_clusters",
vlnplot.lineweight = 0.2, jitter.size = 0.3, ncol = 2, color.panel = color_list)Fig3B: Violin plots showing expression (average log fold change) for genes encoding Structural proteins
Hormones
fig3c <- c("CGA", "PGF")
multi_dittoPlot(bapd8.integrated, vars = fig3c, group.by = "new_clusters",
vlnplot.lineweight = 0.2, jitter.size = 0.3, ncol = 2, color.panel = color_list)Fig3C: Violin plots showing expression (average log fold change) for genes encoding Hormones
Transporters and Carcinoembryonic Antigen
fig3d <- c("SLC40A1", "XAGE2")
multi_dittoPlot(bapd8.integrated, vars = fig3d, group.by = "new_clusters",
vlnplot.lineweight = 0.2, jitter.size = 0.3, ncol = 2, color.panel = color_list)Fig3D: Violin plots showing expression (average log fold change) for genes encoding Transporters and Carcinoembryonic Antigen
Enzymes
fig3e <- c("CYP11A1", "HSD3B1")
multi_dittoPlot(bapd8.integrated, vars = fig3e, group.by = "new_clusters",
vlnplot.lineweight = 0.2, jitter.size = 0.3, ncol = 2, color.panel = color_list)Fig3E: Violin plots showing expression (average log fold change) for genes encoding enzymes
Long non-coding RNAs
fig3f <- c("MALAT1", "NEAT1")
multi_dittoPlot(bapd8.integrated, vars = fig3f, group.by = "new_clusters",
vlnplot.lineweight = 0.2, jitter.size = 0.3, ncol = 2, color.panel = color_list)Fig3F: Violin plots showing expression (average log fold change) for genes encoding lncRNAs
Cluster 4 and 8
# PCE cluster 4
inputGenes<-toupper(markers.filtered.names.4)
humanGene<-humanGeneMapping[humanGeneMapping$Gene.name %in% inputGenes,]
inputGenes<-humanGene$Gene
expressionData<-data[intersect(row.names(data),humanGeneMapping$Gene),]
se<-SummarizedExperiment(assays = SimpleList(as.matrix(expressionData)),rowData = row.names(expressionData),colData = colnames(expressionData))
cellSpecificGenesExp<-teGeneRetrieval(se,expressedGeneThreshold = 1)
print(length(inputGenes))
#> [1] 606
gs<-GeneSet(geneIds=toupper(inputGenes))
output2<-teEnrichmentCustom(gs,cellSpecificGenesExp)
enrichmentOutput<-setNames(data.frame(assay(output2[[1]]),row.names = rowData(output2[[1]])[,1]),colData(output2[[1]])[,1])
row.names(cellDetails)<-cellDetails$RName
enrichmentOutput$Tissue<- cellDetails[row.names(enrichmentOutput),"CellName"]
sct.cluster4.genes <- as.data.frame(assay(output2[[2]][["SCT"]]))[1]
sct.cluster4.genes <- humanGeneMapping[humanGeneMapping$Gene %in% as.list(sct.cluster4.genes$Gene),]
sct.cluster4.genes <- sct.cluster4.genes$Gene.name
# PCE cluster 8
inputGenes<-toupper(markers.filtered.names.8)
humanGene<-humanGeneMapping[humanGeneMapping$Gene.name %in% inputGenes,]
inputGenes<-humanGene$Gene
expressionData<-data[intersect(row.names(data),humanGeneMapping$Gene),]
se<-SummarizedExperiment(assays = SimpleList(as.matrix(expressionData)),rowData = row.names(expressionData),colData = colnames(expressionData))
cellSpecificGenesExp<-teGeneRetrieval(se,expressedGeneThreshold = 1)
print(length(inputGenes))
#> [1] 437
gs<-GeneSet(geneIds=toupper(inputGenes))
output2<-teEnrichmentCustom(gs,cellSpecificGenesExp)
enrichmentOutput<-setNames(data.frame(assay(output2[[1]]),row.names = rowData(output2[[1]])[,1]),colData(output2[[1]])[,1])
row.names(cellDetails)<-cellDetails$RName
enrichmentOutput$Tissue<- cellDetails[row.names(enrichmentOutput),"CellName"]
sct.cluster8.genes <- as.data.frame(assay(output2[[2]][["SCT"]]))[1]
sct.cluster8.genes <- humanGeneMapping[humanGeneMapping$Gene %in% as.list(sct.cluster8.genes$Gene),]
sct.cluster8.genes <- sct.cluster8.genes$Gene.name
x = list(sct.cluster4.genes, sct.cluster8.genes)names(x) <- c("STB genes of Cluster 4","STB genes of Cluster 8")
ggvenn(
x,
fill_color = c(color_list[4],color_list[8]),
stroke_size = NA,
set_name_size = 4,
show_percentage = FALSE
)
#> Warning in sprintf("%d", n, 100 * n/sum(n)): one argument not used by format
#> '%d'Fig5A: STB specific genes shared by clusters 5 and 6
Cluster 4 and 8 (with new marker genes)
# PCE cluster 4
inputGenes<-toupper(markers.filtered.names.4vs8)
humanGene<-humanGeneMapping[humanGeneMapping$Gene.name %in% inputGenes,]
inputGenes<-humanGene$Gene
expressionData<-data[intersect(row.names(data),humanGeneMapping$Gene),]
se<-SummarizedExperiment(assays = SimpleList(as.matrix(expressionData)),rowData = row.names(expressionData),colData = colnames(expressionData))
cellSpecificGenesExp<-teGeneRetrieval(se,expressedGeneThreshold = 1)
print(length(inputGenes))
#> [1] 1278
gs<-GeneSet(geneIds=toupper(inputGenes))
output2<-teEnrichmentCustom(gs,cellSpecificGenesExp)
enrichmentOutput<-setNames(data.frame(assay(output2[[1]]),row.names = rowData(output2[[1]])[,1]),colData(output2[[1]])[,1])
row.names(cellDetails)<-cellDetails$RName
enrichmentOutput$Tissue<- cellDetails[row.names(enrichmentOutput),"CellName"]
sct.cluster4vs8.genes <- as.data.frame(assay(output2[[2]][["SCT"]]))[1]
sct.cluster4vs8.genes <- humanGeneMapping[humanGeneMapping$Gene %in% as.list(sct.cluster4vs8.genes$Gene),]
sct.cluster4vs8.genes <- sct.cluster4vs8.genes$Gene.name
# PCE cluster 8
inputGenes<-toupper(markers.filtered.names.8vs4)
humanGene<-humanGeneMapping[humanGeneMapping$Gene.name %in% inputGenes,]
inputGenes<-humanGene$Gene
expressionData<-data[intersect(row.names(data),humanGeneMapping$Gene),]
se<-SummarizedExperiment(assays = SimpleList(as.matrix(expressionData)),rowData = row.names(expressionData),colData = colnames(expressionData))
cellSpecificGenesExp<-teGeneRetrieval(se,expressedGeneThreshold = 1)
print(length(inputGenes))
#> [1] 411
gs<-GeneSet(geneIds=toupper(inputGenes))
output2<-teEnrichmentCustom(gs,cellSpecificGenesExp)
enrichmentOutput<-setNames(data.frame(assay(output2[[1]]),row.names = rowData(output2[[1]])[,1]),colData(output2[[1]])[,1])
row.names(cellDetails)<-cellDetails$RName
enrichmentOutput$Tissue<- cellDetails[row.names(enrichmentOutput),"CellName"]
sct.cluster8vs4.genes <- as.data.frame(assay(output2[[2]][["SCT"]]))[1]
sct.cluster8vs4.genes <- humanGeneMapping[humanGeneMapping$Gene %in% as.list(sct.cluster8vs4.genes$Gene),]
sct.cluster8vs4.genes <- sct.cluster8vs4.genes$Gene.name
x = list(sct.cluster4vs8.genes, sct.cluster8vs4.genes)names(x) <- c("STB genes of Cluster 4 (vs.8)","STB genes of Cluster 8 (vs.4)")
ggvenn(
x,
fill_color = c(color_list[4],color_list[8]),
stroke_size = NA,
set_name_size = 4,
show_percentage = FALSE
)
#> Warning in sprintf("%d", n, 100 * n/sum(n)): one argument not used by format
#> '%d'Fig5B: STB specific genes shared by clusters 4 and 8 (new)
fig5a <- c("KRT8", "S100P", "XAGE2")
fig5b <- c("ERVV-1", "TBX3", "GRHL1")
multi_dittoPlot(cluster4n8, vars = fig5a, group.by = "new_clusters",
vlnplot.lineweight = 0.2, jitter.size = 0.3, ncol = 1, color.panel = c(color_list[5:6]))
multi_dittoPlot(cluster4n8, vars = fig5b, group.by = "new_clusters",
vlnplot.lineweight = 0.2, jitter.size = 0.3, ncol = 1, color.panel = c(color_list[5:6]))Fig5: STB specific genes shared by clusters 5 and 6. Some highly expressed STB specific genes show (A) higher expression in cluster 5 and (B) higher expression in cluster 6
Save RDS file
Finally we will save the entire session data to an external file. This can be explored again by loading it in R in the future if there is any need.
saveRDS(bapd8.integrated, 'bapd8.integrated.rds')Session Info
Complete session information
sessionInfo()
#> R version 4.1.0 (2021-05-18)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.2 LTS
#>
#> Matrix products: default
#> BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] grid stats4 stats graphics grDevices utils datasets
#> [8] methods base
#>
#> other attached packages:
#> [1] SeuratWrappers_0.3.0 enrichR_3.0
#> [3] ape_5.6-1 DT_0.20
#> [5] plotly_4.10.0 ggvenn_0.1.9
#> [7] scales_1.1.1 ComplexHeatmap_2.10.0
#> [9] dittoSeq_1.6.0 ggrepel_0.9.1
#> [11] calibrate_1.7.7 MASS_7.3-55
#> [13] enhancedDimPlot_0.0.0.9100 forcats_0.5.1
#> [15] purrr_0.3.4 readr_2.1.2
#> [17] tidyr_1.2.0 tibble_3.1.6
#> [19] tidyverse_1.3.1 gprofiler2_0.2.1
#> [21] TissueEnrich_1.14.0 GSEABase_1.56.0
#> [23] graph_1.72.0 annotate_1.72.0
#> [25] XML_3.99-0.8 AnnotationDbi_1.56.2
#> [27] SummarizedExperiment_1.24.0 GenomicRanges_1.46.1
#> [29] GenomeInfoDb_1.30.1 IRanges_2.28.0
#> [31] S4Vectors_0.32.3 MatrixGenerics_1.6.0
#> [33] matrixStats_0.61.0 ensurer_1.1
#> [35] stringr_1.4.0 dplyr_1.0.8
#> [37] gridExtra_2.3 multtest_2.50.0
#> [39] Biobase_2.54.0 BiocGenerics_0.40.0
#> [41] metap_1.8 patchwork_1.1.1
#> [43] cowplot_1.1.1 ggplot2_3.3.5
#> [45] kableExtra_1.3.4 knitr_1.37
#> [47] SeuratObject_4.0.4 Seurat_4.1.0
#>
#> loaded via a namespace (and not attached):
#> [1] SnowballC_0.7.0 scattermore_0.8
#> [3] R.methodsS3_1.8.1 ragg_1.2.1
#> [5] bit64_4.0.5 R.utils_2.11.0
#> [7] irlba_2.3.5 multcomp_1.4-18
#> [9] DelayedArray_0.20.0 data.table_1.14.2
#> [11] rpart_4.1.16 KEGGREST_1.34.0
#> [13] RCurl_1.98-1.6 doParallel_1.0.17
#> [15] generics_0.1.2 TH.data_1.1-0
#> [17] RSQLite_2.2.9 RANN_2.6.1
#> [19] future_1.23.0 tokenizers_0.2.1
#> [21] bit_4.0.4 tzdb_0.2.0
#> [23] mutoss_0.1-12 spatstat.data_2.1-2
#> [25] webshot_0.5.2 xml2_1.3.3
#> [27] lubridate_1.8.0 httpuv_1.6.5
#> [29] assertthat_0.2.1 xfun_0.29
#> [31] hms_1.1.1 jquerylib_0.1.4
#> [33] evaluate_0.14 promises_1.2.0.1
#> [35] fansi_1.0.2 dbplyr_2.1.1
#> [37] readxl_1.3.1 igraph_1.2.11
#> [39] DBI_1.1.2 tmvnsim_1.0-2
#> [41] htmlwidgets_1.5.4 spatstat.geom_2.3-2
#> [43] paletteer_1.4.0 ellipsis_0.3.2
#> [45] RSpectra_0.16-0 crosstalk_1.2.0
#> [47] backports_1.4.1 bookdown_0.24
#> [49] deldir_1.0-6 vctrs_0.3.8
#> [51] SingleCellExperiment_1.16.0 Cairo_1.5-14
#> [53] remotes_2.4.2 ROCR_1.0-11
#> [55] abind_1.4-5 cachem_1.0.6
#> [57] withr_2.4.3 sctransform_0.3.3
#> [59] goftest_1.2-3 mnormt_2.0.2
#> [61] svglite_2.1.0 cluster_2.1.2
#> [63] lazyeval_0.2.2 crayon_1.5.0
#> [65] labeling_0.4.2 pkgconfig_2.0.3
#> [67] qqconf_1.1.1 vipor_0.4.5
#> [69] nlme_3.1-155 rlang_1.0.1
#> [71] globals_0.14.0 lifecycle_1.0.1
#> [73] miniUI_0.1.1.1 sandwich_3.0-1
#> [75] mathjaxr_1.4-0
#> [ reached getOption("max.print") -- omitted 107 entries ]